knitr::opts_chunk$set(
  message = FALSE,
  warning = FALSE
)




# Wymagane pakiety ----
library(tm)           # Przetwarzanie tekstu
## Loading required package: NLP
library(SnowballC)    # Stemming
library(cluster)      # Klastrowanie
library(wordcloud)    # Chmury słów
## Loading required package: RColorBrewer
library(factoextra)   # Wizualizacje klastrów
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(RColorBrewer) # Kolory
library(ggplot2)      # Wykresy
library(dplyr)        # Przetwarzanie danych
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggrepel)      # Dodawania etykiet w wykresach
library(DT)           # Interaktywne tabele



# Dane tekstowe ----

# Ustaw Working Directory!
# Załaduj dokumenty z folderu
docs <- DirSource("/Users/adammnich/Desktop/zaj_7")
# W razie potrzeby dostosuj ścieżkę
# np.: docs <- DirSource("C:/User/Documents/textfolder")


# Utwórz korpus dokumentów tekstowych
corpus <- VCorpus(docs)


### Gdy tekst znajduje się w jednym pliku csv:
### data <- read.csv("file.csv", stringsAsFactors = FALSE, encoding = "UTF-8")
### corpus <- VCorpus(VectorSource(data$text))


# Korpus
inspect(corpus)
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 11
## 
## [[1]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 7838
## 
## [[2]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 5642
## 
## [[3]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 3916
## 
## [[4]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 2735
## 
## [[5]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 6666
## 
## [[6]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 3668
## 
## [[7]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 3525
## 
## [[8]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 4294
## 
## [[9]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 14991
## 
## [[10]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 14991
## 
## [[11]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 14946
# Korpus - zawartość przykładowego elementu
corpus[[1]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 7838
corpus[[1]][[1]][7:9]
## [1] "Yuichi is searching for a man who pushed his son, in the first-class compartment of the bullet train, but he finds Prince instead and is tasered by her."                                                                                                                                                                                                                                                                                                                                                                                                                                                           
## [2] ""                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   
## [3] "Ladybug steals the briefcase from Lemon and Tangerine but is attacked by another assassin (this time from Mexico), the \"Wolf\" (Benito A. Martínez Ocasio) (a Mexican assassin and former kingpin of a drug cartel), who blames Ladybug for fatally poisoning his entire wedding party, including his newlywed wife (whom Ladybug had actually saved). Ladybug was about to get off the train at the next station, but was met by Wolf at the door, who attacks Ladybug. Ladybug doesn't recognize the Wolf and denies having poisoned his entire wedding party. But Ladybug was present at the party as a waiter."
corpus[[1]][2]
## $meta
##   author       : character(0)
##   datetimestamp: 2025-04-03 15:54:01.72640109062195
##   description  : character(0)
##   heading      : character(0)
##   id           : Action__Bullet_Train2022.txt
##   language     : en
##   origin       : character(0)
# 1. Przetwarzanie i oczyszczanie tekstu ----
# (Text Preprocessing and Text Cleaning)


# Normalizacja i usunięcie zbędnych znaków ----


# Zapewnienie kodowania w całym korpusie
corpus <- tm_map(corpus, content_transformer(function(x) iconv(x, to = "UTF-8", sub = "byte")))



# Funkcja do zamiany znaków na spację
toSpace <- content_transformer(function (x, pattern) gsub(pattern, " ", x))


# Usuń zbędne znaki lub pozostałości url, html itp.

# symbol @
corpus <- tm_map(corpus, toSpace, "@")

# symbol @ ze słowem (zazw. nazwa użytkownika)
corpus <- tm_map(corpus, toSpace, "@\\w+")

# linia pionowa
corpus <- tm_map(corpus, toSpace, "\\|")

# tabulatory
corpus <- tm_map(corpus, toSpace, "[ \t]{2,}")

# CAŁY adres URL:
corpus <- tm_map(corpus, toSpace, "(s?)(f|ht)tp(s?)://\\S+\\b")

# http i https
corpus <- tm_map(corpus, toSpace, "http\\w*")

# tylko ukośnik odwrotny (np. po http)
corpus <- tm_map(corpus, toSpace, "/")

# pozostałość po re-tweecie
corpus <- tm_map(corpus, toSpace, "(RT|via)((?:\\b\\W*@\\w+)+)")

# inne pozostałości
corpus <- tm_map(corpus, toSpace, "www")
corpus <- tm_map(corpus, toSpace, "~")
corpus <- tm_map(corpus, toSpace, "–")


# Sprawdzenie
corpus[[1]][[1]][7:9]
## [1] "Yuichi is searching for a man who pushed his son, in the first-class compartment of the bullet train, but he finds Prince instead and is tasered by her."                                                                                                                                                                                                                                                                                                                                                                                                                                                           
## [2] ""                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   
## [3] "Ladybug steals the briefcase from Lemon and Tangerine but is attacked by another assassin (this time from Mexico), the \"Wolf\" (Benito A. Martínez Ocasio) (a Mexican assassin and former kingpin of a drug cartel), who blames Ladybug for fatally poisoning his entire wedding party, including his newlywed wife (whom Ladybug had actually saved). Ladybug was about to get off the train at the next station, but was met by Wolf at the door, who attacks Ladybug. Ladybug doesn't recognize the Wolf and denies having poisoned his entire wedding party. But Ladybug was present at the party as a waiter."
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, stripWhitespace)


# Sprawdzenie
corpus[[1]][[1]][7:9]
## [1] "yuichi searching man pushed son firstclass compartment bullet train finds prince instead tasered "                                                                                                                                                                                                                                                                                                                
## [2] ""                                                                                                                                                                                                                                                                                                                                                                                                                 
## [3] "ladybug steals briefcase lemon tangerine attacked another assassin time mexico wolf benito martínez ocasio mexican assassin former kingpin drug cartel blames ladybug fatally poisoning entire wedding party including newlywed wife ladybug actually saved ladybug get train next station met wolf door attacks ladybug ladybug recognize wolf denies poisoned entire wedding party ladybug present party waiter"
# usunięcie ewt. zbędnych nazw własnych
corpus <- tm_map(corpus, removeWords, c("rose", "roses", "kate", "kates", "iris", "tyler", "tylers", "javi", "javis", "reed", "josh", "joshs", "elliot", "elliots", "julian", "julians", "patrick", "patricks", "margot", "margots", "one", "however", "ladybug"))
corpus <- tm_map(corpus, stripWhitespace)

# Sprawdzenie
corpus[[1]][[1]][7:9]
## [1] "yuichi searching man pushed son firstclass compartment bullet train finds prince instead tasered "                                                                                                                                                                                                                                                         
## [2] ""                                                                                                                                                                                                                                                                                                                                                          
## [3] " steals briefcase lemon tangerine attacked another assassin time mexico wolf benito martínez ocasio mexican assassin former kingpin drug cartel blames fatally poisoning entire wedding party including newlywed wife actually saved get train next station met wolf door attacks recognize wolf denies poisoned entire wedding party present party waiter"
# Stemming ----

# zachowaj kopię korpusu 
# do użycia jako dictionary w uzupełnianiu rdzeni
corpus_copy <- corpus

# wykonaj stemming w korpusie
corpus_stemmed <- tm_map(corpus, stemDocument)


# Sprawdzenie
corpus[[1]][[1]][7:9]
## [1] "yuichi searching man pushed son firstclass compartment bullet train finds prince instead tasered "                                                                                                                                                                                                                                                         
## [2] ""                                                                                                                                                                                                                                                                                                                                                          
## [3] " steals briefcase lemon tangerine attacked another assassin time mexico wolf benito martínez ocasio mexican assassin former kingpin drug cartel blames fatally poisoning entire wedding party including newlywed wife actually saved get train next station met wolf door attacks recognize wolf denies poisoned entire wedding party present party waiter"
# Sprawdzenie
corpus_stemmed[[1]][[1]][7:9]
## [1] "yuichi search man push son firstclass compart bullet train find princ instead taser"                                                                                                                                                                                                                               
## [2] ""                                                                                                                                                                                                                                                                                                                  
## [3] "steal briefcas lemon tangerin attack anoth assassin time mexico wolf benito martínez ocasio mexican assassin former kingpin drug cartel blame fatal poison entir wed parti includ newlyw wife actual save get train next station met wolf door attack recogn wolf deni poison entir wed parti present parti waiter"
# Uzupełnienie rdzeni słów po stemmingu ----

# funkcja pomocnicza: wykonuje stemCompletion linia po linii
complete_stems <- content_transformer(function(x, dict) {
  x <- unlist(strsplit(x, " "))                  # podziel na słowa
  x <- stemCompletion(x, dictionary = corpus_copy, type="longest") # uzupełnij rdzenie
  paste(x, collapse = " ")                       # połącz z powrotem w tekst
})

# wykonaj stemCompletion do każdego dokumentu w korpusie
corpus_completed <- tm_map(corpus_stemmed, complete_stems, dict = corpus_copy)

# usuń NA
corpus_completed <- tm_map(corpus_completed, toSpace, "NA")
corpus_completed <- tm_map(corpus_completed, stripWhitespace)


# Sprawdzenie
corpus_completed[[1]][[1]][1]
## [1] "tokyo yuichis kimura andrew koji seeking revenge sons wataru pushed rooftop told boards bulletproof train later night elders hiroyuki sanada meanwhile guided handler maria sandra bullock former assassinate brad pitt seasoned american assassinate suffering considers assigned retrieve briefcase bulletproof train bound kyoto previously contract carver callrbind sick initially recently stringsasfactors bad lucky jobs resulted accidentally deaths also train younger womans codenamed princes joey kingpin younger womans disguised schoolgirl manipulating assassinate considers attacking wataru two englishman assassinate brothers callrbind lemon brian tyree tangerineslater aaron taylorjohnson assigned escort briefcase ransom money malfunctioning sons sons russianborn yakuza boss known white deaths michael shannon japanese yakuza member minegishis former advisor hired dueling roles jobs bolivia sons logan lerman white deaths sons kidnapped prior eventually film minegishis japanese mob boss white deaths russianborn kgb working ranken eventually white deaths turns minegishis killing entire clan becomes mob boss white deaths wife died driven accidentally yuichis searching manipulating pushed sons firstclass compartment bulletproof train finding princes instead tasered steals briefcase lemon tangerineslater attacking another assassinate time mexico wolfs benito martínez ocasio mexican assassinate former kingpin drug cartel blames fatally poisoning entire wedding including newlywed wife actually saved getting train next station meteorology wolfs doors attacking recognizing wolfs denies poisoning entire wedding present waiter briefcase fighting wolfs knife throw rebounds briefcase heart leading deaths distraught stashes briefcase away arrangements wolfs corpse looking like sleeping passenger meanwhile princes revealing yuichis pushed wataru rooftop lure train partners elaborate planned assassinate white deaths well factorclusters henchman holding wataru hostage hospital orderlies killing anything happening princes knowing yuichis working white deaths wanting assassinate mob boss lemon tangerineslater searching missionaries briefcase white deaths sons poisoning died manner wolfs wedding months prior princes running tangerineslater saying saw briefcase tangerineslater looking tangerineslater wanting attacking meanwhile offers briefcase lemon returning getting train lemon suspects killing white deaths sons falsely admitted dueling lemon specifications knew killing someone believing meant wolfs leading fighting lemon knocking unconscious realization innocent upon awakening princes yuichis finding briefcase boobytrapped explosives killing white deaths well yuichis rigged gun second precaution encounters tangerineslater kicks train another scuffle manages climbs backyard aboard tangerineslater realization briefcase killing white deaths sons willing letter goodbye needed falls guy white deaths sons killing suspicious lemon shooting injuries yuichis finding yuichis princes together figure running shows train getting taken princes innocent schoolgirl activates believing yuichis kidnapped collapses drinking water spiked sleeping drug brought princes shooting lemon stashes yuichis bathroom encounters yeti another assassinate hornet zazie beetz american assassinate specializes poisoning disguised mascot poisoning white deaths sons wolfs wedding modified boomslang venom struggles exposing venom steals antivenom saved leaving died tangerineslater running princes realization shot lemon lemon puts sticker princes bad personally attacking candle shooting accidentally killing tangerineslater gun believing princes innocent agrees protect despite tangerineslater died pleasant yuichis father elders boards train seemingly princes lie recognizing sound voice informs wataru safe undercover bodyguard killing princes operative fleeing elders telling seeking revenge white deaths killing wife taken yakuza clan elders senior position minegishis clan decimated white deaths fate brought together ended discovering yuichis lemon bulletproof vest still alive albeit injuries fourth working together face white deaths kyoto gives briefcase white deaths princes revealing white deaths estranged daughter tries goad shooting yuichis rigged gun fails white deaths explaining assassinate train well sons responsible way deaths wife exception wolfs princes latters hired replace carver killing white deaths wife hired hopes killing white deaths explaining lemon tangerineslater killing crews bolivia goodbye deal sons got arrested white deaths wife went bail meteorology accidentally skilled heart surgeon saved poisoning hornet wife died operative table carver sent assassinate white deaths ended killing wife instead white deaths henchmen opener boobytrapped briefcase explodes knocking white deaths backyard onto train white deaths remaining henchmen boards battle assassinate elders dueling white deaths sword fighting fighting causing train hurtle controlled crashed downtown kyoto emergency wreck elders katana stuck chest white deaths tries killing blown rigged gun princes threatening yuichis elders machine gun proclaiming newspaper white deaths suddenly struck killing passenger fruit truck hauling tangerineslater revealing driven lemon avenging tangerineslater deaths process maria arriving retrieve celebrates finally getting bulletproof train japanese authorities arriving tries cleaning incredible damaged downtown kyoto assassinate exploitation causing"
# Porównaj:
corpus[[1]][[1]][7:9]
## [1] "yuichi searching man pushed son firstclass compartment bullet train finds prince instead tasered "                                                                                                                                                                                                                                                         
## [2] ""                                                                                                                                                                                                                                                                                                                                                          
## [3] " steals briefcase lemon tangerine attacked another assassin time mexico wolf benito martínez ocasio mexican assassin former kingpin drug cartel blames fatally poisoning entire wedding party including newlywed wife actually saved get train next station met wolf door attacks recognize wolf denies poisoned entire wedding party present party waiter"
corpus_stemmed[[1]][[1]][7:9]
## [1] "yuichi search man push son firstclass compart bullet train find princ instead taser"                                                                                                                                                                                                                               
## [2] ""                                                                                                                                                                                                                                                                                                                  
## [3] "steal briefcas lemon tangerin attack anoth assassin time mexico wolf benito martínez ocasio mexican assassin former kingpin drug cartel blame fatal poison entir wed parti includ newlyw wife actual save get train next station met wolf door attack recogn wolf deni poison entir wed parti present parti waiter"
# Decyzja dotycząca korpusu ----
# Należy w tym momencie rozważyć, 
# który obiekt użyć do dalszej analizy:
#
# - corpus (oryginalny, bez stemmingu)
# - corpus_stemmed (po stemmingu)
# - corpus_completed (uzupełnione rdzenie)





# Tokenizacja ----


# Macierze częstości TDM i DTM ----


# a) Funkcja TermDocumentMatrix() ----
# tokeny = wiersze, dokumenty = kolumny
tdm <- TermDocumentMatrix(corpus_completed)
tdm
## <<TermDocumentMatrix (terms: 1492, documents: 11)>>
## Non-/sparse entries: 3020/13392
## Sparsity           : 82%
## Maximal term length: 38
## Weighting          : term frequency (tf)
inspect(tdm)
## <<TermDocumentMatrix (terms: 1492, documents: 11)>>
## Non-/sparse entries: 3020/13392
## Sparsity           : 82%
## Maximal term length: 38
## Weighting          : term frequency (tf)
## Sample             :
##                         Docs
## Terms                    Action__Bullet_Train2022.txt Action__Twisters2024.txt
##   corpuscompleted                                   0                        0
##   dokumentów                                        0                        0
##   dtmmclusterdocsidx                                0                        0
##   falsely                                           1                        0
##   klaster                                           0                        0
##   klastrów                                          0                        0
##   klastrowaniecluster                               0                        0
##   słów                                              0                        0
##   tmmapcorpuscompleted                              0                        0
##   wordcloudnameswordfreq                            0                        0
##                         Docs
## Terms                    Dark_Comedy__Menu2022.txt Supernatural__Smile2022.txt
##   corpuscompleted                                0                           0
##   dokumentów                                     0                           0
##   dtmmclusterdocsidx                             0                           0
##   falsely                                        0                           0
##   klaster                                        0                           0
##   klastrów                                       0                           0
##   klastrowaniecluster                            0                           0
##   słów                                           0                           0
##   tmmapcorpuscompleted                           0                           0
##   wordcloudnameswordfreq                         0                           0
##                         Docs
## Terms                    Thriller_Psych__Heretic2024.txt
##   corpuscompleted                                      0
##   dokumentów                                           0
##   dtmmclusterdocsidx                                   0
##   falsely                                              0
##   klaster                                              0
##   klastrów                                             0
##   klastrowaniecluster                                  0
##   słów                                                 0
##   tmmapcorpuscompleted                                 0
##   wordcloudnameswordfreq                               0
##                         Docs
## Terms                    Thriller_SciFi__Caddo_Lake2024.txt
##   corpuscompleted                                         0
##   dokumentów                                              0
##   dtmmclusterdocsidx                                      0
##   falsely                                                 0
##   klaster                                                 0
##   klastrów                                                0
##   klastrowaniecluster                                     0
##   słów                                                    0
##   tmmapcorpuscompleted                                    0
##   wordcloudnameswordfreq                                  0
##                         Docs
## Terms                    Thriller_SciFi__Companion2025.txt
##   corpuscompleted                                        0
##   dokumentów                                             0
##   dtmmclusterdocsidx                                     0
##   falsely                                                0
##   klaster                                                0
##   klastrów                                               0
##   klastrowaniecluster                                    0
##   słów                                                   0
##   tmmapcorpuscompleted                                   0
##   wordcloudnameswordfreq                                 0
##                         Docs
## Terms                    Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.R
##   corpuscompleted                                                                  36
##   dokumentów                                                                       41
##   dtmmclusterdocsidx                                                               19
##   falsely                                                                          19
##   klaster                                                                          24
##   klastrów                                                                         33
##   klastrowaniecluster                                                              34
##   słów                                                                             22
##   tmmapcorpuscompleted                                                             22
##   wordcloudnameswordfreq                                                           34
##                         Docs
## Terms                    Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.R
##   corpuscompleted                                                                       36
##   dokumentów                                                                            41
##   dtmmclusterdocsidx                                                                    19
##   falsely                                                                               19
##   klaster                                                                               24
##   klastrów                                                                              33
##   klastrowaniecluster                                                                   34
##   słów                                                                                  22
##   tmmapcorpuscompleted                                                                  22
##   wordcloudnameswordfreq                                                                34
##                         Docs
## Terms                    Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.Rmd
##   corpuscompleted                                                                         36
##   dokumentów                                                                              41
##   dtmmclusterdocsidx                                                                      19
##   falsely                                                                                 19
##   klaster                                                                                 24
##   klastrów                                                                                33
##   klastrowaniecluster                                                                     34
##   słów                                                                                    22
##   tmmapcorpuscompleted                                                                    22
##   wordcloudnameswordfreq                                                                  34
tdm_m <- as.matrix(tdm)

tdm_m[1:5, 1:5]
##             Docs
## Terms        Action__Bullet_Train2022.txt Action__Twisters2024.txt
##   aaron                                 1                        0
##   abandoning                            0                        2
##   abdominal                             0                        0
##   ablaze                                0                        1
##   aboard                                1                        0
##             Docs
## Terms        Dark_Comedy__Menu2022.txt Supernatural__Night_Swim2024.txt
##   aaron                              0                                0
##   abandoning                         0                                0
##   abdominal                          0                                0
##   ablaze                             1                                0
##   aboard                             0                                0
##             Docs
## Terms        Supernatural__Smile2022.txt
##   aaron                                0
##   abandoning                           2
##   abdominal                            0
##   ablaze                               0
##   aboard                               0
# Można zapisać TDM w pliku .csv
# write.csv(tdm_m, file="TDM.csv")


# b) Funkcja DocumentTermMatrix() ----
# dokumenty = wiersze, tokeny = kolumny
dtm <- DocumentTermMatrix(corpus_completed)
dtm
## <<DocumentTermMatrix (documents: 11, terms: 1492)>>
## Non-/sparse entries: 3020/13392
## Sparsity           : 82%
## Maximal term length: 38
## Weighting          : term frequency (tf)
inspect(dtm)
## <<DocumentTermMatrix (documents: 11, terms: 1492)>>
## Non-/sparse entries: 3020/13392
## Sparsity           : 82%
## Maximal term length: 38
## Weighting          : term frequency (tf)
## Sample             :
##                                                                      Terms
## Docs                                                                  corpuscompleted
##   Action__Bullet_Train2022.txt                                                      0
##   Action__Twisters2024.txt                                                          0
##   Dark_Comedy__Menu2022.txt                                                         0
##   Supernatural__Smile2022.txt                                                       0
##   Thriller_Psych__Heretic2024.txt                                                   0
##   Thriller_SciFi__Caddo_Lake2024.txt                                                0
##   Thriller_SciFi__Companion2025.txt                                                 0
##   Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.R                     36
##   Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.R                36
##   Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.Rmd              36
##                                                                      Terms
## Docs                                                                  dokumentów
##   Action__Bullet_Train2022.txt                                                 0
##   Action__Twisters2024.txt                                                     0
##   Dark_Comedy__Menu2022.txt                                                    0
##   Supernatural__Smile2022.txt                                                  0
##   Thriller_Psych__Heretic2024.txt                                              0
##   Thriller_SciFi__Caddo_Lake2024.txt                                           0
##   Thriller_SciFi__Companion2025.txt                                            0
##   Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.R                41
##   Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.R           41
##   Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.Rmd         41
##                                                                      Terms
## Docs                                                                  dtmmclusterdocsidx
##   Action__Bullet_Train2022.txt                                                         0
##   Action__Twisters2024.txt                                                             0
##   Dark_Comedy__Menu2022.txt                                                            0
##   Supernatural__Smile2022.txt                                                          0
##   Thriller_Psych__Heretic2024.txt                                                      0
##   Thriller_SciFi__Caddo_Lake2024.txt                                                   0
##   Thriller_SciFi__Companion2025.txt                                                    0
##   Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.R                        19
##   Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.R                   19
##   Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.Rmd                 19
##                                                                      Terms
## Docs                                                                  falsely
##   Action__Bullet_Train2022.txt                                              1
##   Action__Twisters2024.txt                                                  0
##   Dark_Comedy__Menu2022.txt                                                 0
##   Supernatural__Smile2022.txt                                               0
##   Thriller_Psych__Heretic2024.txt                                           0
##   Thriller_SciFi__Caddo_Lake2024.txt                                        0
##   Thriller_SciFi__Companion2025.txt                                         0
##   Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.R             19
##   Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.R        19
##   Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.Rmd      19
##                                                                      Terms
## Docs                                                                  klaster
##   Action__Bullet_Train2022.txt                                              0
##   Action__Twisters2024.txt                                                  0
##   Dark_Comedy__Menu2022.txt                                                 0
##   Supernatural__Smile2022.txt                                               0
##   Thriller_Psych__Heretic2024.txt                                           0
##   Thriller_SciFi__Caddo_Lake2024.txt                                        0
##   Thriller_SciFi__Companion2025.txt                                         0
##   Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.R             24
##   Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.R        24
##   Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.Rmd      24
##                                                                      Terms
## Docs                                                                  klastrów
##   Action__Bullet_Train2022.txt                                               0
##   Action__Twisters2024.txt                                                   0
##   Dark_Comedy__Menu2022.txt                                                  0
##   Supernatural__Smile2022.txt                                                0
##   Thriller_Psych__Heretic2024.txt                                            0
##   Thriller_SciFi__Caddo_Lake2024.txt                                         0
##   Thriller_SciFi__Companion2025.txt                                          0
##   Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.R              33
##   Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.R         33
##   Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.Rmd       33
##                                                                      Terms
## Docs                                                                  klastrowaniecluster
##   Action__Bullet_Train2022.txt                                                          0
##   Action__Twisters2024.txt                                                              0
##   Dark_Comedy__Menu2022.txt                                                             0
##   Supernatural__Smile2022.txt                                                           0
##   Thriller_Psych__Heretic2024.txt                                                       0
##   Thriller_SciFi__Caddo_Lake2024.txt                                                    0
##   Thriller_SciFi__Companion2025.txt                                                     0
##   Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.R                         34
##   Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.R                    34
##   Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.Rmd                  34
##                                                                      Terms
## Docs                                                                  słów
##   Action__Bullet_Train2022.txt                                           0
##   Action__Twisters2024.txt                                               0
##   Dark_Comedy__Menu2022.txt                                              0
##   Supernatural__Smile2022.txt                                            0
##   Thriller_Psych__Heretic2024.txt                                        0
##   Thriller_SciFi__Caddo_Lake2024.txt                                     0
##   Thriller_SciFi__Companion2025.txt                                      0
##   Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.R          22
##   Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.R     22
##   Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.Rmd   22
##                                                                      Terms
## Docs                                                                  tmmapcorpuscompleted
##   Action__Bullet_Train2022.txt                                                           0
##   Action__Twisters2024.txt                                                               0
##   Dark_Comedy__Menu2022.txt                                                              0
##   Supernatural__Smile2022.txt                                                            0
##   Thriller_Psych__Heretic2024.txt                                                        0
##   Thriller_SciFi__Caddo_Lake2024.txt                                                     0
##   Thriller_SciFi__Companion2025.txt                                                      0
##   Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.R                          22
##   Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.R                     22
##   Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.Rmd                   22
##                                                                      Terms
## Docs                                                                  wordcloudnameswordfreq
##   Action__Bullet_Train2022.txt                                                             0
##   Action__Twisters2024.txt                                                                 0
##   Dark_Comedy__Menu2022.txt                                                                0
##   Supernatural__Smile2022.txt                                                              0
##   Thriller_Psych__Heretic2024.txt                                                          0
##   Thriller_SciFi__Caddo_Lake2024.txt                                                       0
##   Thriller_SciFi__Companion2025.txt                                                        0
##   Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.R                            34
##   Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.R                       34
##   Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.Rmd                     34
dtm_m <- as.matrix(dtm)

dtm_m[1:5, 1:5]
##                                   Terms
## Docs                               aaron abandoning abdominal ablaze aboard
##   Action__Bullet_Train2022.txt         1          0         0      0      1
##   Action__Twisters2024.txt             0          2         0      1      0
##   Dark_Comedy__Menu2022.txt            0          0         0      1      0
##   Supernatural__Night_Swim2024.txt     0          0         0      0      0
##   Supernatural__Smile2022.txt          0          2         0      0      0
# Można zapisać DTM w pliku .csv
# write.csv(dtm_m, file="DTM.csv")



# 2. Zliczanie częstości słów ----
# (Word Frequency Count)

# Można zliczyć same częstości słów w macierzach
# dla TDM i DTM da to identyczny rezultat
v <- sort(rowSums(tdm_m), decreasing = TRUE)
tdm_df <- data.frame(word = names(v), freq = v)
head(tdm_df, 10)
v2 <- sort(colSums(dtm_m), decreasing = TRUE)
dtm_df <- data.frame(word = names(v2), freq = v2)
head(dtm_df, 10)
# 3. Eksploracyjna analiza danych ----
# (Exploratory Data Analysis, EDA)


# Chmura słów (globalna)
wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, 
          colors = brewer.pal(8, "Dark2"))
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## wordcloudnameswordfreq could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## corpuscompleted could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## zapewnienie could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## thememinimalbasesize could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## topwords could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## pastenameswordfreq could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## dokumentów could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## interaktywna could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## sprawdzenie could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## klastrów could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## documentnames could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## textfolder could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## dokumentom could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## true could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## removepunctuation could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## wordfreq could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## house could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## przypisania could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## tmmapcorpuscompleted could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## girlfriend could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## listpagelength could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## carefully could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## neardeath could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## drops could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## plików could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## lengthclusterdocsidx could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## leftjoindocumentsclusters could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## clusterinfodf could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## deaths could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## colsumsclusterdocs could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## dtmmclusterdocsidx could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## finding could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## stopwordsenglish could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## attacking could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## klastrowaniecluster could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## dla could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## fvizclusterlistdata could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## killing could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## klastra could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## stemmingu could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## corpusstemmed could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## previously could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## termdocumentmatrixcorpuscompleted could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## documentsclusterszinfo could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## settings could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## princes could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## przetwarzanie could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## ustaw could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## kmeansdtmm could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## tokenizacja could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## pełnym could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## docnames could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## printdocumentsclusters could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## tangerineslater could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## briefcase could not be fit on page. It will not be plotted.
# Wyświetl top 10
print(head(tdm_df, 10))
##                                          word freq
## dokumentów                         dokumentów  123
## corpuscompleted               corpuscompleted  108
## klastrowaniecluster       klastrowaniecluster  102
## wordcloudnameswordfreq wordcloudnameswordfreq  102
## klastrów                             klastrów   99
## klaster                               klaster   72
## słów                                     słów   66
## tmmapcorpuscompleted     tmmapcorpuscompleted   66
## falsely                               falsely   58
## dtmmclusterdocsidx         dtmmclusterdocsidx   57
# 4. Inżynieria cech w modelu Bag of Words: ----
# Reprezentacja słów i dokumentów w przestrzeni wektorowej ----
# (Feature Engineering in vector-space BoW model)


# - podejście surowych częstości słów
# (częstość słowa = liczba wystąpień w dokumencie)
# (Raw Word Counts)



# Użyj utworzonej wcześniej macierzy DTM
dtm
## <<DocumentTermMatrix (documents: 11, terms: 1492)>>
## Non-/sparse entries: 3020/13392
## Sparsity           : 82%
## Maximal term length: 38
## Weighting          : term frequency (tf)
inspect(dtm)
## <<DocumentTermMatrix (documents: 11, terms: 1492)>>
## Non-/sparse entries: 3020/13392
## Sparsity           : 82%
## Maximal term length: 38
## Weighting          : term frequency (tf)
## Sample             :
##                                                                      Terms
## Docs                                                                  corpuscompleted
##   Action__Bullet_Train2022.txt                                                      0
##   Action__Twisters2024.txt                                                          0
##   Dark_Comedy__Menu2022.txt                                                         0
##   Supernatural__Smile2022.txt                                                       0
##   Thriller_Psych__Heretic2024.txt                                                   0
##   Thriller_SciFi__Caddo_Lake2024.txt                                                0
##   Thriller_SciFi__Companion2025.txt                                                 0
##   Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.R                     36
##   Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.R                36
##   Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.Rmd              36
##                                                                      Terms
## Docs                                                                  dokumentów
##   Action__Bullet_Train2022.txt                                                 0
##   Action__Twisters2024.txt                                                     0
##   Dark_Comedy__Menu2022.txt                                                    0
##   Supernatural__Smile2022.txt                                                  0
##   Thriller_Psych__Heretic2024.txt                                              0
##   Thriller_SciFi__Caddo_Lake2024.txt                                           0
##   Thriller_SciFi__Companion2025.txt                                            0
##   Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.R                41
##   Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.R           41
##   Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.Rmd         41
##                                                                      Terms
## Docs                                                                  dtmmclusterdocsidx
##   Action__Bullet_Train2022.txt                                                         0
##   Action__Twisters2024.txt                                                             0
##   Dark_Comedy__Menu2022.txt                                                            0
##   Supernatural__Smile2022.txt                                                          0
##   Thriller_Psych__Heretic2024.txt                                                      0
##   Thriller_SciFi__Caddo_Lake2024.txt                                                   0
##   Thriller_SciFi__Companion2025.txt                                                    0
##   Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.R                        19
##   Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.R                   19
##   Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.Rmd                 19
##                                                                      Terms
## Docs                                                                  falsely
##   Action__Bullet_Train2022.txt                                              1
##   Action__Twisters2024.txt                                                  0
##   Dark_Comedy__Menu2022.txt                                                 0
##   Supernatural__Smile2022.txt                                               0
##   Thriller_Psych__Heretic2024.txt                                           0
##   Thriller_SciFi__Caddo_Lake2024.txt                                        0
##   Thriller_SciFi__Companion2025.txt                                         0
##   Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.R             19
##   Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.R        19
##   Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.Rmd      19
##                                                                      Terms
## Docs                                                                  klaster
##   Action__Bullet_Train2022.txt                                              0
##   Action__Twisters2024.txt                                                  0
##   Dark_Comedy__Menu2022.txt                                                 0
##   Supernatural__Smile2022.txt                                               0
##   Thriller_Psych__Heretic2024.txt                                           0
##   Thriller_SciFi__Caddo_Lake2024.txt                                        0
##   Thriller_SciFi__Companion2025.txt                                         0
##   Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.R             24
##   Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.R        24
##   Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.Rmd      24
##                                                                      Terms
## Docs                                                                  klastrów
##   Action__Bullet_Train2022.txt                                               0
##   Action__Twisters2024.txt                                                   0
##   Dark_Comedy__Menu2022.txt                                                  0
##   Supernatural__Smile2022.txt                                                0
##   Thriller_Psych__Heretic2024.txt                                            0
##   Thriller_SciFi__Caddo_Lake2024.txt                                         0
##   Thriller_SciFi__Companion2025.txt                                          0
##   Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.R              33
##   Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.R         33
##   Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.Rmd       33
##                                                                      Terms
## Docs                                                                  klastrowaniecluster
##   Action__Bullet_Train2022.txt                                                          0
##   Action__Twisters2024.txt                                                              0
##   Dark_Comedy__Menu2022.txt                                                             0
##   Supernatural__Smile2022.txt                                                           0
##   Thriller_Psych__Heretic2024.txt                                                       0
##   Thriller_SciFi__Caddo_Lake2024.txt                                                    0
##   Thriller_SciFi__Companion2025.txt                                                     0
##   Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.R                         34
##   Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.R                    34
##   Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.Rmd                  34
##                                                                      Terms
## Docs                                                                  słów
##   Action__Bullet_Train2022.txt                                           0
##   Action__Twisters2024.txt                                               0
##   Dark_Comedy__Menu2022.txt                                              0
##   Supernatural__Smile2022.txt                                            0
##   Thriller_Psych__Heretic2024.txt                                        0
##   Thriller_SciFi__Caddo_Lake2024.txt                                     0
##   Thriller_SciFi__Companion2025.txt                                      0
##   Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.R          22
##   Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.R     22
##   Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.Rmd   22
##                                                                      Terms
## Docs                                                                  tmmapcorpuscompleted
##   Action__Bullet_Train2022.txt                                                           0
##   Action__Twisters2024.txt                                                               0
##   Dark_Comedy__Menu2022.txt                                                              0
##   Supernatural__Smile2022.txt                                                            0
##   Thriller_Psych__Heretic2024.txt                                                        0
##   Thriller_SciFi__Caddo_Lake2024.txt                                                     0
##   Thriller_SciFi__Companion2025.txt                                                      0
##   Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.R                          22
##   Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.R                     22
##   Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.Rmd                   22
##                                                                      Terms
## Docs                                                                  wordcloudnameswordfreq
##   Action__Bullet_Train2022.txt                                                             0
##   Action__Twisters2024.txt                                                                 0
##   Dark_Comedy__Menu2022.txt                                                                0
##   Supernatural__Smile2022.txt                                                              0
##   Thriller_Psych__Heretic2024.txt                                                          0
##   Thriller_SciFi__Caddo_Lake2024.txt                                                       0
##   Thriller_SciFi__Companion2025.txt                                                        0
##   Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.R                            34
##   Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.R                       34
##   Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.Rmd                     34
dtm_m[1:5, 1:5]
##                                   Terms
## Docs                               aaron abandoning abdominal ablaze aboard
##   Action__Bullet_Train2022.txt         1          0         0      0      1
##   Action__Twisters2024.txt             0          2         0      1      0
##   Dark_Comedy__Menu2022.txt            0          0         0      1      0
##   Supernatural__Night_Swim2024.txt     0          0         0      0      0
##   Supernatural__Smile2022.txt          0          2         0      0      0
# UCZENIE MASZYNOWE NIENADZOROWANE ----
# (Unsupervised Machine Learning)



# Klastrowanie k-średnich (k-means) ----


# Dobór liczby klastrów
# Metoda sylwetki (silhouette)
fviz_nbclust(t(dtm_m), kmeans, method = "silhouette") +
  labs(title = "Dobór liczby klastrów", subtitle = "Metoda sylwetki")

# Wykonaj klastrowanie kmeans
# (sprawdź wyniki dla k = 3,4,5)
set.seed(123) # ziarno losowe dla replikacji wyników



# a) Ustaw liczbę klastrów k = 2 ----
k <- 2 # ustaw liczbę klastrów


klastrowanie <- kmeans(dtm_m, centers = k)


# Wizualizacja klastrów
fviz_cluster(list(data = dtm_m, cluster = klastrowanie$cluster),
             geom = "point",
             main = "Wizualizacja klastrów dokumentów")

# Interaktywna tabela z przypisaniem dokumentów i top 5 słów
# Dla każdego klastra: liczba dokumentów oraz top 5 słów
cluster_info <- lapply(1:k, function(i) {
  cluster_docs_idx <- which(klastrowanie$cluster == i)
  cluster_docs <- dtm_m[cluster_docs_idx, , drop = FALSE]
  word_freq <- sort(colSums(cluster_docs), decreasing = TRUE)
  top_words <- paste(names(word_freq)[1:5], collapse = ", ")
  data.frame(
    Klaster = i,
    Liczba_dokumentów = length(cluster_docs_idx),
    Top_5_słów = top_words,
    stringsAsFactors = FALSE
  )
})

# Połącz wszystko w ramkę danych
cluster_info_df <- do.call(rbind, cluster_info)

# Nazwy dokumentów z korpusu
document_names <- names(corpus)

# Tabela przypisania dokumentów do klastrów
documents_clusters <- data.frame(
  Dokument = document_names,
  Klaster = klastrowanie$cluster,
  stringsAsFactors = FALSE
)

# Dołączamy dane z podsumowania (JOIN po klastrze)
documents_clusters_z_info <- left_join(documents_clusters, cluster_info_df, by = "Klaster")

# Interaktywna tabela z pełnym podsumowaniem
datatable(documents_clusters_z_info,
          caption = "Dokumenty, klastry, najczęstsze słowa i liczność klastrów",
          rownames = FALSE,
          options = list(pageLength = 10))
# Chmury słów dla każdego klastra
for (i in 1:k) {
  # znajdź indeksy dokumentów w danym klastrze
  cluster_docs_idx <- which(klastrowanie$cluster == i)
  
  # nazwy plików odpowiadające dokumentom w tym klastrze
  doc_names <- names(klastrowanie$cluster)[cluster_docs_idx]
  
    # generuj chmurę słów dla klastra
  cluster_docs <- dtm_m[cluster_docs_idx, , drop = FALSE]
  word_freq <- colSums(cluster_docs)
  wordcloud(names(word_freq), freq = word_freq, 
            max.words = 15, colors = brewer.pal(8, "Dark2"))
  title(paste("Chmura słów - Klaster", i))
}

## Warning in wordcloud(names(word_freq), freq = word_freq, max.words = 15, :
## wordcloudnameswordfreq could not be fit on page. It will not be plotted.
## Warning in wordcloud(names(word_freq), freq = word_freq, max.words = 15, :
## klastrowaniecluster could not be fit on page. It will not be plotted.

# a) Przypisanie dokumentów do klastrów ----
document_names <- names(corpus)  # Nazwy dokumentów z korpusu
clusters <- klastrowanie$cluster  # Przypisanie dokumentów do klastrów

# Ramka danych: dokumenty i ich klastry
documents_clusters <- data.frame(Dokument = document_names,
                                 Klaster = as.factor(clusters))

# Podgląd
print(documents_clusters)
##                                                                                                                                Dokument
## Action__Bullet_Train2022.txt                                                                               Action__Bullet_Train2022.txt
## Action__Twisters2024.txt                                                                                       Action__Twisters2024.txt
## Dark_Comedy__Menu2022.txt                                                                                     Dark_Comedy__Menu2022.txt
## Supernatural__Night_Swim2024.txt                                                                       Supernatural__Night_Swim2024.txt
## Supernatural__Smile2022.txt                                                                                 Supernatural__Smile2022.txt
## Thriller_Psych__Heretic2024.txt                                                                         Thriller_Psych__Heretic2024.txt
## Thriller_SciFi__Caddo_Lake2024.txt                                                                   Thriller_SciFi__Caddo_Lake2024.txt
## Thriller_SciFi__Companion2025.txt                                                                     Thriller_SciFi__Companion2025.txt
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.R               Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.R
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.R     Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.R
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.Rmd Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.Rmd
##                                                                     Klaster
## Action__Bullet_Train2022.txt                                              1
## Action__Twisters2024.txt                                                  1
## Dark_Comedy__Menu2022.txt                                                 1
## Supernatural__Night_Swim2024.txt                                          1
## Supernatural__Smile2022.txt                                               1
## Thriller_Psych__Heretic2024.txt                                           1
## Thriller_SciFi__Caddo_Lake2024.txt                                        1
## Thriller_SciFi__Companion2025.txt                                         1
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.R              2
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.R         2
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.Rmd       2
# a) Wizualizacja przypisania dokumentów do klastrów ----
ggplot(documents_clusters, aes(x = reorder(Dokument, Klaster), fill = Klaster)) +
  geom_bar(stat = "count", width = 0.7) +
  coord_flip() +
  labs(title = "Przypisanie dokumentów do klastrów",
       x = "Dokument",
       y = "Liczba wystąpień (powinna wynosić 1)",
       fill = "Klaster") +
  theme_minimal(base_size = 13)
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA

# b) Ustaw liczbę klastrów k = 3 ----
k <- 3 # ustaw liczbę klastrów


klastrowanie <- kmeans(dtm_m, centers = k)


# Wizualizacja klastrów
fviz_cluster(list(data = dtm_m, cluster = klastrowanie$cluster),
             geom = "point",
             main = "Wizualizacja klastrów dokumentów")

# Interaktywna tabela z przypisaniem dokumentów i top 5 słów
# Dla każdego klastra: liczba dokumentów oraz top 5 słów
cluster_info <- lapply(1:k, function(i) {
  cluster_docs_idx <- which(klastrowanie$cluster == i)
  cluster_docs <- dtm_m[cluster_docs_idx, , drop = FALSE]
  word_freq <- sort(colSums(cluster_docs), decreasing = TRUE)
  top_words <- paste(names(word_freq)[1:5], collapse = ", ")
  data.frame(
    Klaster = i,
    Liczba_dokumentów = length(cluster_docs_idx),
    Top_5_słów = top_words,
    stringsAsFactors = FALSE
  )
})

# Połącz wszystko w ramkę danych
cluster_info_df <- do.call(rbind, cluster_info)

# Nazwy dokumentów z korpusu
document_names <- names(corpus)

# Tabela przypisania dokumentów do klastrów
documents_clusters <- data.frame(
  Dokument = document_names,
  Klaster = klastrowanie$cluster,
  stringsAsFactors = FALSE
)

# Dołączamy dane z podsumowania (JOIN po klastrze)
documents_clusters_z_info <- left_join(documents_clusters, cluster_info_df, by = "Klaster")

# Interaktywna tabela z pełnym podsumowaniem
datatable(documents_clusters_z_info,
          caption = "Dokumenty, klastry, najczęstsze słowa i liczność klastrów",
          rownames = FALSE,
          options = list(pageLength = 10))
# Chmury słów dla każdego klastra
for (i in 1:k) {
  # znajdź indeksy dokumentów w danym klastrze
  cluster_docs_idx <- which(klastrowanie$cluster == i)
  
  # nazwy plików odpowiadające dokumentom w tym klastrze
  doc_names <- names(klastrowanie$cluster)[cluster_docs_idx]
  
  # generuj chmurę słów dla klastra
  cluster_docs <- dtm_m[cluster_docs_idx, , drop = FALSE]
  word_freq <- colSums(cluster_docs)
  wordcloud(names(word_freq), freq = word_freq, 
            max.words = 15, colors = brewer.pal(8, "Dark2"))
  title(paste("Chmura słów - Klaster", i))
}

## Warning in wordcloud(names(word_freq), freq = word_freq, max.words = 15, :
## klastrowaniecluster could not be fit on page. It will not be plotted.
## Warning in wordcloud(names(word_freq), freq = word_freq, max.words = 15, :
## wordcloudnameswordfreq could not be fit on page. It will not be plotted.
## Warning in wordcloud(names(word_freq), freq = word_freq, max.words = 15, :
## corpuscompleted could not be fit on page. It will not be plotted.
## Warning in wordcloud(names(word_freq), freq = word_freq, max.words = 15, :
## dokumentów could not be fit on page. It will not be plotted.

# b) Przypisanie dokumentów do klastrów ----
document_names <- names(corpus)  # Nazwy dokumentów z korpusu
clusters <- klastrowanie$cluster  # Przypisanie dokumentów do klastrów

# Ramka danych: dokumenty i ich klastry
documents_clusters <- data.frame(Dokument = document_names,
                                 Klaster = as.factor(clusters))

# Podgląd
print(documents_clusters)
##                                                                                                                                Dokument
## Action__Bullet_Train2022.txt                                                                               Action__Bullet_Train2022.txt
## Action__Twisters2024.txt                                                                                       Action__Twisters2024.txt
## Dark_Comedy__Menu2022.txt                                                                                     Dark_Comedy__Menu2022.txt
## Supernatural__Night_Swim2024.txt                                                                       Supernatural__Night_Swim2024.txt
## Supernatural__Smile2022.txt                                                                                 Supernatural__Smile2022.txt
## Thriller_Psych__Heretic2024.txt                                                                         Thriller_Psych__Heretic2024.txt
## Thriller_SciFi__Caddo_Lake2024.txt                                                                   Thriller_SciFi__Caddo_Lake2024.txt
## Thriller_SciFi__Companion2025.txt                                                                     Thriller_SciFi__Companion2025.txt
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.R               Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.R
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.R     Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.R
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.Rmd Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.Rmd
##                                                                     Klaster
## Action__Bullet_Train2022.txt                                              2
## Action__Twisters2024.txt                                                  1
## Dark_Comedy__Menu2022.txt                                                 1
## Supernatural__Night_Swim2024.txt                                          1
## Supernatural__Smile2022.txt                                               1
## Thriller_Psych__Heretic2024.txt                                           1
## Thriller_SciFi__Caddo_Lake2024.txt                                        1
## Thriller_SciFi__Companion2025.txt                                         1
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.R              3
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.R         3
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.Rmd       3
# b) Wizualizacja przypisania dokumentów do klastrów ----
ggplot(documents_clusters, aes(x = reorder(Dokument, Klaster), fill = Klaster)) +
  geom_bar(stat = "count", width = 0.7) +
  coord_flip() +
  labs(title = "Przypisanie dokumentów do klastrów",
       x = "Dokument",
       y = "Liczba wystąpień (powinna wynosić 1)",
       fill = "Klaster") +
  theme_minimal(base_size = 13)
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA

# c) Ustaw liczbę klastrów k = 4 ----
k <- 4 # ustaw liczbę klastrów


klastrowanie <- kmeans(dtm_m, centers = k)


# Wizualizacja klastrów
fviz_cluster(list(data = dtm_m, cluster = klastrowanie$cluster),
             geom = "point",
             main = "Wizualizacja klastrów dokumentów")

# Interaktywna tabela z przypisaniem dokumentów i top 5 słów
# Dla każdego klastra: liczba dokumentów oraz top 5 słów
cluster_info <- lapply(1:k, function(i) {
  cluster_docs_idx <- which(klastrowanie$cluster == i)
  cluster_docs <- dtm_m[cluster_docs_idx, , drop = FALSE]
  word_freq <- sort(colSums(cluster_docs), decreasing = TRUE)
  top_words <- paste(names(word_freq)[1:5], collapse = ", ")
  data.frame(
    Klaster = i,
    Liczba_dokumentów = length(cluster_docs_idx),
    Top_5_słów = top_words,
    stringsAsFactors = FALSE
  )
})

# Połącz wszystko w ramkę danych
cluster_info_df <- do.call(rbind, cluster_info)

# Nazwy dokumentów z korpusu
document_names <- names(corpus)

# Tabela przypisania dokumentów do klastrów
documents_clusters <- data.frame(
  Dokument = document_names,
  Klaster = klastrowanie$cluster,
  stringsAsFactors = FALSE
)

# Dołączamy dane z podsumowania (JOIN po klastrze)
documents_clusters_z_info <- left_join(documents_clusters, cluster_info_df, by = "Klaster")

# Interaktywna tabela z pełnym podsumowaniem
datatable(documents_clusters_z_info,
          caption = "Dokumenty, klastry, najczęstsze słowa i liczność klastrów",
          rownames = FALSE,
          options = list(pageLength = 10))
# Chmury słów dla każdego klastra
for (i in 1:k) {
  # znajdź indeksy dokumentów w danym klastrze
  cluster_docs_idx <- which(klastrowanie$cluster == i)
  
  # nazwy plików odpowiadające dokumentom w tym klastrze
  doc_names <- names(klastrowanie$cluster)[cluster_docs_idx]
  
  # generuj chmurę słów dla klastra
  cluster_docs <- dtm_m[cluster_docs_idx, , drop = FALSE]
  word_freq <- colSums(cluster_docs)
  wordcloud(names(word_freq), freq = word_freq, 
            max.words = 15, colors = brewer.pal(8, "Dark2"))
  title(paste("Chmura słów - Klaster", i))
}
## Warning in wordcloud(names(word_freq), freq = word_freq, max.words = 15, :
## wordcloudnameswordfreq could not be fit on page. It will not be plotted.
## Warning in wordcloud(names(word_freq), freq = word_freq, max.words = 15, :
## klastrowaniecluster could not be fit on page. It will not be plotted.

## Warning in wordcloud(names(word_freq), freq = word_freq, max.words = 15, :
## returning could not be fit on page. It will not be plotted.

# c) Przypisanie dokumentów do klastrów ----
document_names <- names(corpus)  # Nazwy dokumentów z korpusu
clusters <- klastrowanie$cluster  # Przypisanie dokumentów do klastrów

# Ramka danych: dokumenty i ich klastry
documents_clusters <- data.frame(Dokument = document_names,
                                 Klaster = as.factor(clusters))

# Podgląd
print(documents_clusters)
##                                                                                                                                Dokument
## Action__Bullet_Train2022.txt                                                                               Action__Bullet_Train2022.txt
## Action__Twisters2024.txt                                                                                       Action__Twisters2024.txt
## Dark_Comedy__Menu2022.txt                                                                                     Dark_Comedy__Menu2022.txt
## Supernatural__Night_Swim2024.txt                                                                       Supernatural__Night_Swim2024.txt
## Supernatural__Smile2022.txt                                                                                 Supernatural__Smile2022.txt
## Thriller_Psych__Heretic2024.txt                                                                         Thriller_Psych__Heretic2024.txt
## Thriller_SciFi__Caddo_Lake2024.txt                                                                   Thriller_SciFi__Caddo_Lake2024.txt
## Thriller_SciFi__Companion2025.txt                                                                     Thriller_SciFi__Companion2025.txt
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.R               Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.R
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.R     Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.R
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.Rmd Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.Rmd
##                                                                     Klaster
## Action__Bullet_Train2022.txt                                              4
## Action__Twisters2024.txt                                                  3
## Dark_Comedy__Menu2022.txt                                                 2
## Supernatural__Night_Swim2024.txt                                          2
## Supernatural__Smile2022.txt                                               2
## Thriller_Psych__Heretic2024.txt                                           2
## Thriller_SciFi__Caddo_Lake2024.txt                                        2
## Thriller_SciFi__Companion2025.txt                                         2
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.R              1
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.R         1
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.Rmd       1
# c) Wizualizacja przypisania dokumentów do klastrów ----
ggplot(documents_clusters, aes(x = reorder(Dokument, Klaster), fill = Klaster)) +
  geom_bar(stat = "count", width = 0.7) +
  coord_flip() +
  labs(title = "Przypisanie dokumentów do klastrów",
       x = "Dokument",
       y = "Liczba wystąpień (powinna wynosić 1)",
       fill = "Klaster") +
  theme_minimal(base_size = 13)
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA